{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Outlier Detection with `bqplot`\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook, we create a class `DNA` that leverages the new bqplot canvas based [HeatMap](https://github.com/bloomberg/bqplot/blob/master/examples/Marks/HeatMap.ipynb) along with the ipywidgets Range Slider to help us detect and clean outliers in our data. The class accepts a DataFrame and allows you to visually and programatically filter your outliers. The cleaned DataFrame can then be retrieved through a simple convenience function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bqplot import (DateScale, ColorScale, HeatMap, \n",
    "                    Figure, LinearScale, OrdinalScale, Axis)\n",
    "from scipy.stats import percentileofscore\n",
    "from scipy.interpolate import interp1d\n",
    "import bqplot.pyplot as plt\n",
    "from traitlets import List, Float, observe\n",
    "from ipywidgets import IntRangeSlider, Layout, VBox, HBox, jslink\n",
    "from pandas import DatetimeIndex\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "def quantile_space(x, q1=0.1, q2=0.9):\n",
    "    '''\n",
    "    Returns a function that squashes quantiles between q1 and q2\n",
    "    '''\n",
    "    q1_x, q2_x = np.percentile(x, [q1, q2])\n",
    "    qs = np.percentile(x, np.linspace(0, 100, 100))\n",
    "    def get_quantile(t):\n",
    "        return np.interp(t, qs, np.linspace(0, 100, 100))\n",
    "    def f(y):\n",
    "        return np.interp(get_quantile(y), [0, q1, q2, 100], [-1, 0, 0, 1])\n",
    "    return f\n",
    "\n",
    "class DNA(VBox):\n",
    "    \n",
    "    colors = List()\n",
    "    q1 = Float()\n",
    "    q2 = Float()\n",
    "    \n",
    "    def __init__(self, data, **kwargs):\n",
    "        self.data = data\n",
    "        date_x, date_y = False, False\n",
    "        transpose = kwargs.pop('transpose', False)\n",
    "        if transpose is True:\n",
    "            if type(data.index) is DatetimeIndex:\n",
    "                self.x_scale = DateScale()\n",
    "            if type(data.columns) is DatetimeIndex:\n",
    "                self.y_scale = DateScale()\n",
    "            x, y = list(data.columns.values), data.index.values\n",
    "        else:\n",
    "            if type(data.index) is DatetimeIndex:\n",
    "                date_x = True\n",
    "            if type(data.columns) is DatetimeIndex:\n",
    "                date_y = True\n",
    "            x, y = data.index.values, list(data.columns.values)\n",
    "            \n",
    "        self.q1, self.q2 = kwargs.pop('quantiles', (1, 99))\n",
    "        \n",
    "        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)\n",
    "        self.colors = kwargs.pop('colors', ['Red', 'Black', 'Green'])\n",
    "        \n",
    "        self.x_scale = DateScale() if date_x is True else LinearScale()\n",
    "        self.y_scale = DateScale() if date_y is True else OrdinalScale(padding_y=0)\n",
    "        self.color_scale = ColorScale(colors=self.colors)\n",
    "        self.heat_map = HeatMap(color=self.quant_func(self.data.T), x=x, y=y, scales={'x': self.x_scale, 'y': self.y_scale,\n",
    "                                                                               'color': self.color_scale})\n",
    "        self.x_ax = Axis(scale=self.x_scale)\n",
    "        self.y_ax = Axis(scale=self.y_scale, orientation='vertical')\n",
    "        show_axes = kwargs.pop('show_axes', True)\n",
    "        self.axes = [self.x_ax, self.y_ax] if show_axes is True else []\n",
    "        \n",
    "        self.height = kwargs.pop('height', '800px')\n",
    "        self.layout = kwargs.pop('layout', Layout(width='100%', height=self.height, flex='1'))\n",
    "        self.fig_margin = kwargs.pop('fig_margin', {'top': 60, 'bottom': 60, 'left': 150, 'right': 0})\n",
    "        kwargs.setdefault('padding_y', 0.0)\n",
    "        \n",
    "        self.create_interaction(**kwargs)\n",
    "        \n",
    "        self.figure = Figure(marks=[self.heat_map], axes=self.axes, fig_margin=self.fig_margin, \n",
    "                             layout=self.layout, min_aspect_ratio=0.,**kwargs)\n",
    "        \n",
    "        super(VBox, self).__init__(children=[self.range_slider, self.figure], layout=Layout(align_items='center',\n",
    "                                                                                           width='100%',\n",
    "                                                                                           height='100%'),\n",
    "                                   **kwargs)\n",
    "        \n",
    "    def create_interaction(self, **kwargs):\n",
    "        self.range_slider = IntRangeSlider(description='Filter Range', value=(self.q1, self.q2), layout=Layout(width='100%'))\n",
    "        self.range_slider.observe(self.slid_changed, 'value')\n",
    "        self.observe(self.changed, ['q1', 'q2'])\n",
    "        \n",
    "        \n",
    "    def slid_changed(self, new):\n",
    "        self.q1 = self.range_slider.value[0]\n",
    "        self.q2 = self.range_slider.value[1]\n",
    "        \n",
    "    def changed(self, new):\n",
    "        self.range_slider.value = (self.q1, self.q2)\n",
    "        \n",
    "        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)\n",
    "        self.heat_map.color = self.quant_func(self.data.T)\n",
    "        \n",
    "    def get_filtered_df(self, fill_type='median'):\n",
    "        q1_x, q2_x = np.percentile(self.data, [self.q1, self.q2])\n",
    "        if fill_type == 'median':\n",
    "            return self.data[(self.data >= q1_x) & (self.data <= q2_x)].apply(lambda x: x.fillna(x.median()))\n",
    "        elif fill_type == 'mean':\n",
    "            return self.data[(self.data >= q1_x) & (self.data <= q2_x)].apply(lambda x: x.fillna(x.mean()))\n",
    "        else:\n",
    "            raise ValueError(\"fill_type must be one of ('median', 'mean')\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We define the size of our matrix here. Larger matrices require a larger height."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "size = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def num_to_col_letters(num):\n",
    "    letters = ''\n",
    "    while num:\n",
    "        mod = (num - 1) % 26\n",
    "        letters += chr(mod + 65)\n",
    "        num = (num - 1) // 26\n",
    "    return ''.join(reversed(letters))\n",
    "\n",
    "letters = []\n",
    "\n",
    "for i in range(1, size+1):\n",
    "    letters.append(num_to_col_letters(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.random.randn(size, size), columns=letters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dna = DNA(data, title='DNA of our Data', height='1400px', colors=['Red', 'White', 'Green'])\n",
    "data_dna"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Instead of setting the quantiles by the sliders, we can also set them programatically. Using a range of (5, 95) restricts the data considerably."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dna.q1, data_dna.q2 = 5, 95"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we can use the convenience function to extract a clean DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data_clean = data_dna.get_filtered_df()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The DNA fills outliers with the mean of the column. Alternately, we can fill the outliers by the mean."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_mean = data_dna.get_filtered_df(fill_type='mean')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can also visualize the new DataFrame the same way to test how our outliers look now."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DNA(data_clean, title='Cleaned Data', height='1200px', colors=['Red', 'White', 'Green'])"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
